

***** Clean Raw Gazole Data *****
foreach i of numlist 2007/2018 {
clear 
import delimited "$path\Raw Data\Base_prix`i'_sp95.csv", delim(";") varnames(1)

drop v1
drop if id_pdv=="NA" | date_ma=="NA" | id_carburant=="NA"  | prix=="NA"
destring prix, replace force


gen jour_maj=substr(date_maj,1,10)
gen heure_maj=substr(date_maj,12,8)
drop date_maj

*Generate a variable indicating hours
capture drop ts
egen str ts=concat(jour_maj heure_maj), punct(" ")
capture drop dt
gen double dt = clock(ts, "YMD hms")
format dt %tc


gen annee=substr(jour_maj,1,4)
gen mois=substr(jour_maj,6,2)
gen jour=substr(jour_maj,9,2)
destring annee, replace force
destring mois, replace force
destring jour, replace force

gen date_maj = mdy(mois, jour, annee)
format date_maj %d
drop jour_maj annee mois jour
drop heure_maj
drop ts
rename dt full_date_maj


save "$path\Intermediary Data\Base_prix`i'_sp95_toappend.dta", replace
}


// Append all
clear
use "$path\Intermediary Data\Base_prix2007_sp95_toappend.dta"
append using "$path\Intermediary Data\Base_prix2008_sp95_toappend.dta"
append using "$path\Intermediary Data\Base_prix2009_sp95_toappend.dta"
append using "$path\Intermediary Data\Base_prix2010_sp95_toappend.dta"
append using "$path\Intermediary Data\Base_prix2011_sp95_toappend.dta"
append using "$path\Intermediary Data\Base_prix2012_sp95_toappend.dta"
append using "$path\Intermediary Data\Base_prix2013_sp95_toappend.dta"
append using "$path\Intermediary Data\Base_prix2014_sp95_toappend.dta"
append using "$path\Intermediary Data\Base_prix2015_sp95_toappend.dta"
append using "$path\Intermediary Data\Base_prix2016_sp95_toappend.dta"
append using "$path\Intermediary Data\Base_prix2017_sp95_toappend.dta"
append using "$path\Intermediary Data\Base_prix2018_sp95_toappend.dta"

// Transform tax-included prices into tax-excluded prices
do "$path\Codes\Prepare_IntermediaryData\02_TICPE.do"

capture drop num_obs
capture drop first_obs


sort id_pdv full_date_maj
capture drop num_obs
gen num_obs=_n

// 0. Preliminary: identify cases where same observations on same exact hour

*** First case: same exact price and hour: drop duplicates
duplicates drop id_pdv full_date_maj prix, force

*** Second case: same exact hour but not same price --> keep highest value
duplicates tag id_pdv full_date_maj, gen(tag)
bysort id_pdv full_date_maj: egen prix_max=max(prix)
drop if prix!=prix_max & tag!=0 & tag!=.
capture drop tag
capture drop prix_max
duplicates drop id_pdv full_date_maj, force /* Last duplicates drop for check: nothing should be dropped */

// 1. Suppress "too low" price levels which correspond to "wrong" updates 
xtile xt_prix_ht=prix_ht, nq(200)
drop if xt_prix_ht==200 | xt_prix_ht==1
capture drop xt_prix_ht


** Keep only one observation per price, namely the first one
capture drop same_price_asbefore
bysort id_pdv (num_obs): gen same_price_asbefore=1 if prix==prix[_n-1]
keep if same_price_asbefore==.
drop same_price_asbefore

*** Keep one observation per day, namely the last one 
capture drop same_price_asbefore
capture drop same_day_asafter
bysort id_pdv (num_obs): gen same_day_asafter=1 if date_maj==date_maj[_n+1]
drop if same_day_asafter==1
capture drop same_day_asafter

// 2. Drop outliers in terms of stations

// 2.1 - Drop stations depending on median time interval between two changes
bysort id_pdv (num_obs): gen timeinterval_lastchange=date_maj-date_maj[_n-1]
bysort id_pdv: egen median_timeinterval=median(timeinterval_lastchange)

** Check deciles 
bysort id_pdv (num_obs): gen first_obs=1 if _n==1
summ median_timeinterval if first_obs==1, detail
summ timeinterval_lastchange, detail

drop if median_timeinterval>365 
drop if median_timeinterval>182 
drop if median_timeinterval>90 
drop if median_timeinterval>30 
drop if median_timeinterval>12 

drop median_timeinterval


// 2.2 - Drop stations staying less than 6 months in the sample

bysort id_pdv: egen min_day_maj=min(date_maj)
bysort id_pdv: egen max_day_maj=max(date_maj)
gen number_days_insample=max_day_maj-min_day_maj

drop if number_days_insample<30 // 246 observations deleted
drop if number_days_insample<60 // 479 observations deleted
drop if number_days_insample<90 // 495 observations deleted
drop if number_days_insample<120 // 676 observations deleted
drop if number_days_insample<160 // 827 observations deleted
drop if number_days_insample<180 // 509 observations deleted

drop min_day_maj
drop max_day_maj 


// 3. If duration between two price changes is greater than 15/30/60 days, no price change is computed and a new identifier is created

* 3.1 Create a new identifier

capture drop number_changes_othertypes
bysort id_pdv (num_obs): gen number_changes_othertypes=num_obs-num_obs[_n-1]-1

capture drop tag_sup30
gen tag_sup30=0
replace tag_sup30=1 if timeinterval_lastchange>30 & timeinterval_lastchange!=. 

capture drop tag_sup15
gen tag_sup15=0
replace tag_sup15=1 if timeinterval_lastchange>15 & timeinterval_lastchange!=. 

capture drop tag_sup60
gen tag_sup60=0
replace tag_sup60=1 if timeinterval_lastchange>60 & timeinterval_lastchange!=. 




**** Generate a new ID equal to ID_PDV + Num_Obs

egen id_pdv_15_temp=concat(id_pdv num_obs) if tag_sup15==1, punct("_")
by id_pdv: carryforward id_pdv_15_temp, gen(id_pdv_15)
replace id_pdv_15=id_pdv if id_pdv_15==""
capture drop id_pdv_15_temp

egen id_pdv_30_temp=concat(id_pdv num_obs) if tag_sup30==1, punct("_")
by id_pdv: carryforward id_pdv_30_temp, gen(id_pdv_30)
replace id_pdv_30=id_pdv if id_pdv_30==""
capture drop id_pdv_30_temp

egen id_pdv_60_temp=concat(id_pdv num_obs) if tag_sup60==1, punct("_")
by id_pdv: carryforward id_pdv_60_temp, gen(id_pdv_60)
replace id_pdv_60=id_pdv if id_pdv_60==""
capture drop id_pdv_60_temp


bysort id_pdv_15: egen min_day_maj15=min(date_maj)
bysort id_pdv_15: egen max_day_maj15=max(date_maj)
gen number_days_insample15=max_day_maj15-min_day_maj15

bysort id_pdv_30: egen min_day_maj30=min(date_maj)
bysort id_pdv_30: egen max_day_maj30=max(date_maj)
gen number_days_insample30=max_day_maj30-min_day_maj30

bysort id_pdv_60: egen min_day_maj60=min(date_maj)
bysort id_pdv_60: egen max_day_maj60=max(date_maj)
gen number_days_insample60=max_day_maj60-min_day_maj60

bysort id_pdv_15 (num_obs): gen first_obs15=1 if _n==1
bysort id_pdv_30 (num_obs): gen first_obs30=1 if _n==1
bysort id_pdv_60 (num_obs): gen first_obs60=1 if _n==1




keep date_maj prix prix_ht id_pdv_30 num_obs

egen id_pdv_30num=group(id_pdv_30)

gen date_maj_num=date_maj
drop date_maj
drop num_obs

xtset id_pdv_30num date_maj_num

tsfill


by id_pdv_30num: carryforward prix, replace
by id_pdv_30num: carryforward prix_ht, replace
by id_pdv_30num: carryforward id_pdv_30, replace


save "$path\Intermediary Data\Prix_SP9530_Level_DailyFULL.dta", replace


